{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义决策树的节点"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DecisionNode():\n",
    "    \"\"\"\n",
    "    决策树的决策节点或者是叶子节点\n",
    "    -----------------------\n",
    "    feature_index:代表所属特征的列号\n",
    "    threshold:判断的阈值\n",
    "    value:预测值|cm=mean(sum(y))\n",
    "    left:左子树--符合阈值条件\n",
    "    right:右子树--不符合调价\n",
    "    \"\"\"\n",
    "    def __init__(self,feature_index=None,threshold=None,value=None,left=None,right=None):\n",
    "        self.feature_index=feature_index\n",
    "        self.threshold=threshold\n",
    "        self.value=value\n",
    "        self.left=left\n",
    "        self.right=right\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 切分函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def divide(X,feature_index,threshold):#根据超二分类，进行数据集的二分切分\n",
    "    ##考虑到feature可能是连续性数据或者是离散型的数据，那么在切分特征的时候，切分函数是不一样的\n",
    "    ##连续性的数据根据x>=threshold来进行二分切分\n",
    "    ##离散型的数据根据x==threshold来进行二分切分\n",
    "    split=None\n",
    "    ##如果是连续性的数据\n",
    "    if isinstance(threshold,int) or isinstance(threshold,float):\n",
    "        split=lambda item:item[feature_index]>=threshold\n",
    "    else:\n",
    "        split=lambda item:item[feature_i]==threshold\n",
    "    X1=np.array([sample for sample in X if split(sample)])\n",
    "    X2=np.array([sample for sample in X if not split(sample)])\n",
    "    \n",
    "    return np.array([X1,X2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方差函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def variance(X):\n",
    "    mean=np.ones(np.shape(X))*X.mean(0)\n",
    "    N=np.shape(X)[0]\n",
    "    variance=(1/N)*np.diag((X-mean).T.dot(X-mean))\n",
    "    return variance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2.25, 2.25, 2.25])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "variance(np.asanyarray([[1,2,3],[4,5,6]]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<br><br>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 决策树框架"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DecisionTree(object):\n",
    "    \"\"\"\n",
    "    决策树的超类\n",
    "    \n",
    "    ---------------\n",
    "    min_samples_split:最小样本数量，当大于等于该数量的时候可以进行分裂，如果小于该数量则不进行分裂\n",
    "    \n",
    "    min_gain:分裂最小增益，当小于该增益的时候，不进行分裂\n",
    "    \n",
    "    max_depth:最大深度，当大于该深度的时候不进行分裂\n",
    "    \n",
    "    loss:function:Loss函数的Handle\n",
    "    \n",
    "    \"\"\"\n",
    "    \n",
    "    def __init__(self,min_samples_split=2,min_gain=0.01,max_depth=100,loss=None):\n",
    "        self.root=None\n",
    "        self.min_samples_split=min_samples_split\n",
    "        self.min_gain=min_gain\n",
    "        self.max_depht=max_depth\n",
    "        self._gain=None\n",
    "        self._leaf_value=None\n",
    "        self.one_dim=None#(len(X),)->(Len(X),1)\n",
    "        self.loss=loss\n",
    "\n",
    "    def fit(self,X,y,loss=None):        \n",
    "        self.one_dim=len(np.shape(y))==1\n",
    "        self.root=self.build_tree(X,y)      \n",
    "        self.loss=None\n",
    "    \n",
    "    def build_tree(self,X,y,current_depth=0):\n",
    "        \"\"\"循环递归的进行决策树的构建，依据增益函数的值进行X的分裂\"\"\"\n",
    "        largest_gain=0\n",
    "        best_condition=None\n",
    "        best_sets=None\n",
    "        if len(np.shape(y))==1:\n",
    "            y=np.expand_dims(y,axis=1)#将(x,)的数组变成(x,1)的数组\n",
    "        Xy=np.concatenate((X,y),axis=1)#将y的值作为列合并到X中\n",
    "        n_samples,n_features=np.shape(X)#得到行数和列数\n",
    "        if n_samples>=self.min_samples_split :\n",
    "            #and current_depth<=self.max_depth:\n",
    "            for feature_i in range(n_features):#遍历所有的属性寻找最佳切分点\n",
    "                values = np.expand_dims(X[:, feature_i], axis=1)\n",
    "                unique_values=np.unique(values)#将所有的values进行去重\n",
    "                for threshold in unique_values:#遍历该feature_i所有可能的切分点，寻找最佳切分点\n",
    "                    Xy1,Xy2=divide(Xy,feature_i,threshold)\n",
    "                    if len(Xy1)>0 and len(Xy2)>0:\n",
    "                        y1=Xy1[:,n_features:]#最后的y值取出来\n",
    "                        y2=Xy2[:,n_features:]\n",
    "                        gain=self._gain(y,y1,y2)\n",
    "                        if gain>largest_gain:#如果比当前最大的增益还大，则保存当前记录\n",
    "                            largest_gain=gain\n",
    "                            best_condition={\"feature_index\":feature_i,\"threshold\":threshold}\n",
    "                            best_sets={\n",
    "                                \"leftX\":Xy1[:,:n_features],#左子树的X\n",
    "                                \"lefty\":Xy1[:,n_features:],\n",
    "                                \"rightX\":Xy2[:,:n_features],\n",
    "                                \"righty\":Xy2[:,n_features:]\n",
    "                            }\n",
    "        if largest_gain>=self.min_gain:#如果达到了分裂的最小增益，则进行分裂\n",
    "            left=self.build_tree(best_sets[\"leftX\"],best_sets[\"lefty\"],current_depth+1)\n",
    "            right=self.build_tree(best_sets[\"rightX\"],best_sets[\"righty\"],current_depth+1)\n",
    "            return DecisionNode(feature_index=best_condition['feature_index'],threshold=best_condition['threshold'],left=left,right=right)\n",
    "        leaf_value=self._leaf_value(y)\n",
    "        return DecisionNode(value=leaf_value)\n",
    "    \n",
    "    def predict(self,X):\n",
    "        \"\"\"\n",
    "        逐个遍历x in X，寻找相应的叶子节点\n",
    "        \"\"\"\n",
    "        y_pred=[self.predict_v(x) for x in X]\n",
    "        \n",
    "        return y_pred\n",
    "    \n",
    "    def predict_v(self,x,tree=None):\n",
    "        \"\"\"\n",
    "        递归的进行树的遍历\n",
    "        \"\"\"\n",
    "        if tree is None:\n",
    "            tree=self.root\n",
    "        if tree.value is not None:\n",
    "            return tree.value\n",
    "        feature_value=x[tree.feature_index]#取出需要进行判定的列\n",
    "        \n",
    "        branch=tree.right\n",
    "        #如果是数值类型的feature，则使用>=来进行判断\n",
    "        if isinstance(feature_value,int) or isinstance(feature_value,float):\n",
    "            if feature_value>=tree.threshold:\n",
    "                branch=tree.left\n",
    "        elif feature_value==tree.threshold:#如果是离散型的数值，则使用==来进行判断\n",
    "            branch=tree.left\n",
    "        \n",
    "        return self.predict_v(x,branch)\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 回归树"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "class RegressionTree(DecisionTree):\n",
    "    \n",
    "    def _calculate_variance_gain(self,y,y1,y2):\n",
    "        var_total=variance(y)\n",
    "        var_1=variance(y1)\n",
    "        var_2=variance(y2)\n",
    "        p1=len(y1)/len(y)\n",
    "        p2=len(y2)/len(y)\n",
    "        var_gain=var_total-(p1*var_1+p2*var_2)\n",
    "        return sum(var_gain)\n",
    "    def _mean(self,y):\n",
    "        value=np.mean(y,axis=0)\n",
    "        return value if len(value) > 1 else value[0]\n",
    "    def fit(self,X,y):\n",
    "        self._gain=self._calculate_variance_gain\n",
    "        self._leaf_value=self._mean\n",
    "        super(RegressionTree,self).fit(X,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "model=RegressionTree()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model.max_depht"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<br><br>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用竞赛数据进行预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "train=pd.read_csv('datas/house_data.csv')\n",
    "y=train['SalePrice']\n",
    "train1=train.drop(['Id','SalePrice'],axis=1)\n",
    "X=pd.get_dummies(train1).reset_index(drop=True)\n",
    "X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "model=RegressionTree()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred=model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "logrmse=np.sqrt(mean_squared_error(np.log(abs(y_test)),np.log(abs(np.asarray(pred)))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.21581984642359023"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "logrmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 作业：\n",
    "\n",
    "使用自己写的回归树，跑项目的数据，通过调整超参来得到最优值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "#np.expand_dims"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model.root.left.left.left.left.value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4rc1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
